title: "Mixture Models" author: "Abdul-Rahman" date: "10/10/2019" output: pdf_document: default html_document: default
library("ggplot2")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("mixtools")
## mixtools package, version 1.1.0, Released 2017-03-10
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.
library("mosaics")
## Loading required package: Rcpp
library("mosaicsExample")
library("HistData")
library("bootstrap")
library("ggbeeswarm")
library("RColorBrewer")
library("ggthemes")
library("magrittr")
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:mosaics':
##
## export
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library("pheatmap")
library("colorspace")
library("grid")
##
## Attaching package: 'grid'
## The following object is masked from 'package:mixtools':
##
## depth
library("ggbio")
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin,
## pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unsplit, which,
## which.max, which.min
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Need specific help about ggbio? try mailing
## the maintainer or visit http://tengfei.github.com/ggbio/
##
## Attaching package: 'ggbio'
## The following objects are masked from 'package:ggplot2':
##
## geom_bar, geom_rect, geom_segment, ggsave, stat_bin,
## stat_identity, xlim
library("GenomicRanges")
## Loading required package: stats4
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:plotly':
##
## rename
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:plotly':
##
## slice
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## Loading required package: GenomeInfoDb
library("flexmix")
## Loading required package: lattice
library("mclust")
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:bootstrap':
##
## diabetes
## The following object is masked from 'package:mixtools':
##
## dmvnorm
library("mixtools")
library("mosaics")
library("mosaicsExample")
library("here")
## here() starts at /Users/abdul-rahmanbukari/Documents/BIOSTATS
library("HistData")
library("bootstrap")
library("vcd")
##
## Attaching package: 'vcd'
## The following object is masked from 'package:GenomicRanges':
##
## tile
## The following object is masked from 'package:IRanges':
##
## tile
Generate a random number from a normal distribution with mean 3 and variance 0.25
# Fix the RNG seed so the coin-flip simulation is reproducible
set.seed(1234)
# 10,000 fair Bernoulli(0.5) draws: TRUE plays the role of "heads"
coinflips = (runif(10000) > 0.5)
# Tally the two outcomes (expect roughly 5000/5000)
table(coinflips)
## coinflips
## FALSE TRUE
## 4990 5010
# Draw one value from a two-component normal mixture. The logical flag
# `fl` selects the component: TRUE -> N(mean1, sd1^2), FALSE -> N(mean2, sd2^2).
# Exactly one rnorm() call is made per invocation, so the RNG stream is
# identical to the branching version.
oneFlip = function(fl, mean1 = 1, mean2 = 3, sd1 = 0.5, sd2 = 0.5) {
  mu <- if (fl) mean1 else mean2
  s <- if (fl) sd1 else sd2
  rnorm(1, mu, s)
}
# Apply oneFlip to each flip; vapply enforces a numeric(1) result per draw
fairmix = vapply(coinflips, oneFlip, numeric(1))
# Histogram of the resulting two-component mixture sample
ggplot(tibble(value = fairmix), aes(x = value)) +
geom_histogram(fill = "purple", binwidth = 0.1)
We can use R's vectorized syntax to remove the vapply-loop and generate the fairmix vector more efficiently.
# Component means and standard deviations for the vectorized simulation
means = c(1, 3)
sds = c(0.25, 0.25)
# NOTE(review): sds here are 0.25 whereas oneFlip used sd = 0.5, so this
# sample is not directly comparable to fairmix — confirm intent.
# One vectorized rnorm call; ifelse picks per-element mean/sd by coin flip
values = rnorm(length(coinflips),
mean = ifelse(coinflips, means[1], means[2]),
sd = ifelse(coinflips, sds[1], sds[2]))
ggplot(tibble(value = values), aes(x = value)) +
geom_histogram(fill = "purple", binwidth = 0.1)
What if we make one million coin flips and make a histogram with 500 bins
# One million flips; inside tibble(), later columns may refer to earlier ones
fair = tibble(
coinflips = (runif(1e6) > 0.5),
values = rnorm(length(coinflips),
mean = ifelse(coinflips, means[1], means[2]),
sd = ifelse(coinflips, sds[1], sds[2])))
# With 500 bins the histogram approaches the smooth mixture density
ggplot(fair, aes(x = values)) +
geom_histogram(fill = "purple", bins = 500)
The histogram gets nearer to a smooth curve called the density function which offers a better approximation of the number of mixtures.
Compare the values of fair$values for which coinflips is TRUE to density function for a normal random variable.
# Density-scaled histogram of the TRUE-coin component, overlaid with the
# theoretical N(means[1], sds[1]) density in red.
# NOTE(review): ..density.. is deprecated in newer ggplot2 in favor of
# after_stat(density); kept here for compatibility with the ggplot2
# version this document was knitted with.
ggplot(dplyr::filter(fair, coinflips), aes(x = values)) +
geom_histogram(aes(y = ..density..), fill = "purple",
binwidth = 0.01) +
stat_function(fun = dnorm,
args = list(mean = means[1], sd = sds[1]), color = "red")
Our distributions so far can be mathematically represented as the sum of the density distributions
We can use this concept to generate a graph for two component mixture model with extremely visible distributions (using our previous mean and standard deviations) and minimal overlap.
# Theoretical equal-weight (0.5/0.5) two-component mixture density on a grid
fairtheory = tibble(
x = seq(-1, 5, length.out = 1000),
f = 0.5 * dnorm(x, mean = means[1], sd = sds[1]) +
0.5 * dnorm(x, mean = means[2], sd = sds[2]))
ggplot(fairtheory, aes(x = x, y = f)) +
geom_line(color = "red", size = 1.5) + ylab("mixture density")
What if we had a figure generated by the code chunk below (means = 1 and 2; sd = sqrt(.5) and sqrt(.5)) without prior knowledge of the code? That is, we form a hard-to-resolve distribution as the means of the individual distributions get closer and the variance increases.
set.seed(1234)
# A harder mixture: component means only 1 apart, common variance 0.5
mystery = tibble(
coinflips = (runif(1e3) > 0.5),
values = rnorm(length(coinflips),
mean = ifelse(coinflips, 1, 2),
sd = ifelse(coinflips, sqrt(.5), sqrt(.5))))
# 30 equally spaced histogram breaks spanning the observed range
br2 = with(mystery, seq(min(values), max(values), length.out = 30))
ggplot(mystery, aes(x = values)) +
geom_histogram(fill = "purple", breaks = br2)
head(mystery, 3)
## # A tibble: 3 x 2
## coinflips values
## <lgl> <dbl>
## 1 FALSE 2.70
## 2 TRUE 0.134
## 3 TRUE 1.50
# Same break construction as br2 above, reused for both component plots
br = with(mystery, seq(min(values), max(values), length.out = 30))
# Plot each (normally hidden) component with its own color
ggplot(mystery, aes(x = values)) +
geom_histogram(data = dplyr::filter(mystery, coinflips),
fill = "red", alpha = 0.2, breaks = br) +
geom_histogram(data = dplyr::filter(mystery, !coinflips),
fill = "darkblue", alpha = 0.2, breaks = br)
# This code separates the distributions and treats overlapping values as belonging to separate groups, as is evident from the heights of the bars.
We can now plot all the āmysteryā graphs in layers over each other
# Overlay both component histograms plus the pooled (purple) histogram
ggplot(mystery, aes(x = values, fill = coinflips)) +
geom_histogram(data = dplyr::filter(mystery, coinflips),
fill = "red", alpha = 0.2, breaks = br) +
geom_histogram(data = dplyr::filter(mystery, !coinflips),
fill = "darkblue", alpha = 0.2, breaks = br) +
geom_histogram(fill = "purple", breaks = br, alpha = 0.2)
The EM algorithm is used to infer the value of hidden groupings. It does so by alternating between:
- Pretending to know the probability that each observation belongs to a component (and estimating the distribution parameters of these components)
- Pretending to know the distribution parameters of each component (and estimating the probability that each observation belongs to a given component)
# Head probabilities of the two biased coins
probHead = c(0.125, 0.25)
# NOTE(review): no set.seed() before this loop, so these results are not
# reproducible (the seeded repeat below is). Also note the loop variable
# `pi` shadows the constant base::pi inside the loop body.
for (pi in c(1/8, 1/4)) {
whCoin = sample(2, 100, replace = TRUE, prob = c(pi, 1-pi))
K = rbinom(length(whCoin), size = 2, prob = probHead[whCoin])
print(table(K))
}
## K
## 0 1 2
## 62 34 4
## K
## 0 1 2
## 67 30 3
For example, a variable Y is measured on a series of objects that come from two groups (however we do not know which group each object belongs to). The goal is to estimate the parameters that generate the data Y. This can be done by:
- Augmenting the data with a latent (unobserved/hidden) variable, U. This results in a bivariate distribution because we are looking at the distribution of "couples" (Y, U) - Adopting the Maximum Likelihood approach to find the values of U and the unknown parameters of the underlying densities.
Scenario 1 Suppose we have two unfair coins, whose probabilities of heads are p1 = 0.125 and p2 = 0.25. With probability pi we pick coin 1, with probability 1 - pi, coin 2. We then toss that coin twice and record the number of heads K. a) Simulate 100 instances of this procedure, with pi = 1/8, and compute the contingency table of K. b) Do the same with pi = 1/4. c) Can the aforementioned parameters be estimated from the contingency table?
Solution
probHead = c(0.125, 0.25)
# Seeded version of the simulation above, for reproducibility.
set.seed(1234)
# NOTE: the loop variable `pi` shadows the constant base::pi in the body
for (pi in c(1/8, 1/4)) {
whCoin = sample(2, 100, replace = TRUE, prob = c(pi, 1-pi))
K = rbinom(length(whCoin), size = 2, prob = probHead[whCoin])
print(table(K))
}
## K
## 0 1 2
## 55 33 12
## K
## 0 1 2
## 60 29 11
The parameters are difficult to estimate due to the numerous degrees of freedom.
Scenario 2 Estimating the parameters of a mixture of two normals with mean parameters unknown and standard deviation equal to 1.
# Generate a labeled mixture with R, keeping the component labels "u"
# Randomly sample 1 or 2, 100 times, with replacement
mus = c(-0.5, 1.5)
set.seed(1234)
u = sample(2, 100, replace = TRUE)
# Normal draws of length 100 with mean -0.5 (label 1) or 1.5 (label 2)
y = rnorm(length(u), mean = mus[u])
# Keep labels and observations together in one table
duy = tibble(u, y)
head(duy)
## # A tibble: 6 x 2
## u y
## <int> <dbl>
## 1 2 -0.306
## 2 2 0.918
## 3 2 0.391
## 4 2 0.485
## 5 1 -0.662
## 6 2 2.06
Since we know the labels "u", we can estimate the means by using separate ML equations for each group.
duy %>% group_by(u) %>% summarise(mean(y))
## # A tibble: 2 x 2
## u `mean(y)`
## <int> <dbl>
## 1 1 -0.479
## 2 2 1.62
# BUG FIX: Mclust() has no `nclass` argument — the unknown argument was
# silently swallowed by `...`, so the original call fit the default
# G = 1:9 models and BIC selected a single component (hence the all-1
# classification printed further below). The number of mixture
# components is requested via `G`.
EMtest <- Mclust(duy$y, G = 2)
# Number of observations used in the fit
EMtest$n
## [1] 100
EMtest$df
## [1] 2
EMtest$BIC
## Bayesian Information Criterion (BIC):
## E V
## 1 -360.6557 -360.6557
## 2 -367.9925 -367.3822
## 3 -371.8851 -382.3664
## 4 -375.8227 -386.7824
## 5 -383.3537 -398.3807
## 6 -388.2105 -403.3627
## 7 -397.4478 -414.0851
## 8 -406.6570 -427.8954
## 9 -415.9232 -436.2490
##
## Top 3 models based on the BIC criterion:
## E,1 V,1 V,2
## -360.6557 -360.6557 -367.3822
EMtest$classification
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
head(duy$u,20)
## [1] 2 2 2 2 1 2 1 1 1 2 2 2 2 1 2 2 2 1 2 2
# Two-component normal mixture via EM (mixtools), with symmetric starting
# mixing weights and near-identical starting means
gm <- normalmixEM(duy$y, k=2, lambda=c(0.5,0.5), mu=c(-0.01,0.01), sigma=c(1,1))
## number of iterations= 563
# Estimated mixing proportions
gm$lambda
## [1] 0.09610953 0.90389047
# Estimated component means
gm$mu
## [1] -1.6326458 0.9920098
# Estimated component standard deviations
gm$sigma
## [1] 0.3038369 1.2265384
# Final observed-data log-likelihood
gm$loglik
## [1] -172.1765
Ecological and molecular data often come in the form of counts, such as the count data in RNA-seq. Such data can often be seen as a mixture of two scenarios: a. If the gene is not expressed the count is zero b. If the gene is expressed the number of transcripts in different individuals may differ
The R packages pscl and zicounts provide many examples and functions for working with zero inflated counts.
Demonstration with chromatin immunoprecipitation (ChIP) data
# Bin the ChIP BAM file (chr22, single-end) into 200-bp bins, extending
# reads to a 200-bp fragment length; output goes to Book/data
constructBins(infile = system.file(file.path("extdata", "wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam"), package="mosaicsExample"),
fileFormat = "bam", outfileLoc = here("Book", "data"),
byChr = FALSE, useChrfile = FALSE, chrfile = NULL, excludeChr = NULL,
PET = FALSE, fragLen = 200, binSize = 200, capping = 0)
## ------------------------------------------------------------
## Info: setting summary
## ------------------------------------------------------------
## Name of aligned read file: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/mosaicsExample/extdata/wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam
## Aligned read file format: BAM
## Directory of processed bin-level files: /Users/abdul-rahmanbukari/Documents/BIOSTATS/Book/data
## Construct bin-level files by chromosome? N
## Is file for chromosome info provided? N
## Data type: Single-end tag (SET)
## Average fragment length: 200
## Bin size: 200
## ------------------------------------------------------------
## Use the provided BAM index file.
## Chromosome information is extracted from the BAM file.
## Info: reading the aligned read file and processing it into bin-level files...
## Info: done!
## ------------------------------------------------------------
## Info: processing summary
## ------------------------------------------------------------
## Directory of processed bin-level files: /Users/abdul-rahmanbukari/Documents/BIOSTATS/Book/data
## Processed bin-level file: wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt
## Sequencing depth: 231822
## ------------------------------------------------------------
# Same binning for the matched input (control) BAM file
constructBins(infile = system.file(file.path("extdata", "wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam"), package="mosaicsExample"),
fileFormat = "bam", outfileLoc = here("Book", "data"),
byChr = FALSE, useChrfile = FALSE, chrfile = NULL, excludeChr = NULL,
PET = FALSE, fragLen = 200, binSize = 200, capping = 0)
## ------------------------------------------------------------
## Info: setting summary
## ------------------------------------------------------------
## Name of aligned read file: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/mosaicsExample/extdata/wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam
## Aligned read file format: BAM
## Directory of processed bin-level files: /Users/abdul-rahmanbukari/Documents/BIOSTATS/Book/data
## Construct bin-level files by chromosome? N
## Is file for chromosome info provided? N
## Data type: Single-end tag (SET)
## Average fragment length: 200
## Bin size: 200
## ------------------------------------------------------------
## Use the provided BAM index file.
## Chromosome information is extracted from the BAM file.
## Info: reading the aligned read file and processing it into bin-level files...
## Info: done!
## ------------------------------------------------------------
## Info: processing summary
## ------------------------------------------------------------
## Directory of processed bin-level files: /Users/abdul-rahmanbukari/Documents/BIOSTATS/Book/data
## Processed bin-level file: wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt
## Sequencing depth: 182605
## ------------------------------------------------------------
# Paths to the two bin-level files written by constructBins() above
datafiles = c(here("Book","data","wgEncodeSydhTfbsGm12878Stat1StdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt"), here("Book", "data","wgEncodeSydhTfbsGm12878InputStdAlnRep1_chr22_sorted.bam_fragL200_bin200.txt"))
# Read ChIP and matched input counts into a single BinData object
binTFBS = readBins(type = c("chip","input"), fileName = datafiles)
## Info: reading and preprocessing bin-level data...
## Info: data contains only one chromosome.
## Info: done!
binTFBS
## Summary: bin-level data (class: BinData)
## ----------------------------------------
## - # of chromosomes in the data: 1
## - total effective tag counts: 462479
## (sum of ChIP tag counts of all bins)
## - control sample is incorporated
## - mappability score is NOT incorporated
## - GC content score is NOT incorporated
## - uni-reads are assumed
## ----------------------------------------
# print() on a BinData object also returns the bin-level counts, which we
# capture here for plotting
bincts = print(binTFBS)
ggplot(bincts, aes(x = tagCount)) +
geom_histogram(binwidth = 1, fill = "forestgreen")
It can be observed that there are many zeros, although from this plot it is not immediately obvious whether the number of zeros is really extraordinary, given the frequencies of the other small numbers. Let's redo the histogram of the counts using a logarithm base 10 scale on the y axis and estimate π0, the proportion of bins with zero counts.
# Log10 y-scale makes the excess of zero-count bins easier to judge
ggplot(bincts, aes(x = tagCount)) + scale_y_log10() +
geom_histogram(binwidth = 1, fill = "forestgreen")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 17 rows containing missing values (geom_bar).
###Mixtures greater than two Example: What sort of histogram do we get when weighing 7,000 canonical nucleotides (A,T,G,C) each type has a different weight, measured with the same standard deviation sd=3.
# Nucleotide masses and sampling probabilities for the 4-component mixture
masses = c(A = 331, C = 307, G = 347, T = 322)
probs = c(A = 0.12, C = 0.38, G = 0.36, T = 0.14)
N = 7000
sd = 3
# NOTE(review): no set.seed() in this first variant (the two below are
# seeded). Assigning `sd` is harmless shadowing: R skips non-function
# bindings when resolving a call like sd(y).
nuclt = sample(length(probs), N, replace = TRUE, prob = probs)
quadwts = rnorm(length(nuclt),
mean = masses[nuclt],
sd = sd)
ggplot(tibble(quadwts = quadwts), aes(x = quadwts)) +
geom_histogram(bins = 100, fill = "purple")
What happens if we repeat this exercise with N=1000 nucleotide measurements?
# Same experiment with far fewer measurements (N = 1000)
N = 1000
sd = 3
set.seed(1234)
nuclt = sample(length(probs), N, replace = TRUE, prob = probs)
quadwts = rnorm(length(nuclt),
mean = masses[nuclt],
sd = sd)
ggplot(tibble(quadwts = quadwts), aes(x = quadwts)) +
geom_histogram(bins = 100, fill = "purple")
The middle components have become less distinct, seemingly indicating just 3 components; this is technically an overlap of the mixtures. Lesson: increasing the number of measurements enhances the distinctiveness of the components.
What happens if we keep N=7000 measurements but make the standard deviation 10?
# Back to N = 7000 but with much larger measurement error (sd = 10)
N = 7000
sd = 10
set.seed(1234)
nuclt = sample(length(probs), N, replace = TRUE, prob = probs)
quadwts = rnorm(length(nuclt),
mean = masses[nuclt],
sd = sd)
ggplot(tibble(quadwts = quadwts), aes(x = quadwts)) +
geom_histogram(bins = 100, fill = "purple")
Now the individual components are even less distinct. It looks like there are only 2 components within our 4 component data. Because increasing the standard deviation results in mass overlap.
Thus as we have enough measurements with good enough precision, we are able to distinguish the four nucleotides and decompose the distribution.
Here we will consider an extreme case of mixture models, where the number of observations is equal to the number of mixture components. An example is Darwin's Zea mays data, where he compared the heights of 15 cross-fertilized and 15 self-fertilized plants.
ZeaMays$diff
## [1] 6.125 -8.375 1.000 2.000 0.750 2.875 3.500 5.125 1.750 3.625
## [11] 7.000 3.000 9.375 7.500 -6.000
# Show each of the 15 differences as a vertical line of mass 1/15
ggplot(ZeaMays, aes(x = diff, ymax = 1/15, ymin = 0)) +
geom_linerange(size = 1, col = "forestgreen") + ylim(0, 0.1)
We can bootstrap to estimate the sampling distribution of the median of the Zea Mays differences.
set.seed(1234)
# B bootstrap resamples of the 15 differences; keep the median of each
B = 1000
meds = replicate(B, {
i = sample(15, 15, replace = TRUE)
median(ZeaMays$diff[i])
})
ggplot(tibble(medians = meds), aes(x = medians)) +
geom_histogram(bins = 30, fill = "purple")
We can also estimate a 99% confidence interval for the median based on these simulations. The quantile function can be used for this.
quantile(meds,c(0.005,0.995))
## 0.5% 99.5%
## 0.99875 6.12500
Use the bootstrap package to redo the same analysis using the function bootstrap for both median and mean. What differences do you notice between the sampling distribution of the mean and the median?
# Same analysis via bootstrap::bootstrap(), for both mean and median;
# $thetastar holds the B bootstrap replicates of the statistic
boot_means=bootstrap(ZeaMays$diff, B, mean)
boot_medians=bootstrap(ZeaMays$diff, B, median)
ggplot(tibble(boot_means$thetastar),aes(x = boot_means$thetastar))+
geom_histogram(bins = 30, fill = "purple")
ggplot(tibble(boot_medians$thetastar),aes(x = boot_medians$thetastar))+
geom_histogram(bins = 30, fill = "purple")
The bootstrap uses a mixture with n components, so with a sample of size n, it qualifies as a nonparametric method. If the sample is composed of n=3 different values, how many different bootstrap resamples are possible? Answer the same question with n = 15.
c(N3 = choose(5, 3), N15 = choose(29, 15))
## N3 N15
## 10 77558760
###Infinite Mixtures
##Infinite mixture of normals
This is when the number of mixture components is greater than or equal to the number of observations.
Letās simulate an infinite mixture using a 2 level of data generation technique. In level 1, we generate 10000 random deviates (w) from the exponential distribution with rate (ie. mean) equal to 1. In level 2, the deviates (w) serve as the variances of normal variables with mean μ generated using rnorm. We finally visualize the data.
# Two-level hierarchy: exponential variances w (level 1), then normals
# with sd sqrt(w) (level 2). The marginal of lps is a Laplace distribution.
w = rexp(10000, rate = 1)
mu = 0.3
lps = rnorm(length(w), mean = mu, sd = sqrt(w))
ggplot(data.frame(lps), aes(x = lps)) +
geom_histogram(fill = "purple", binwidth = 0.1)
This is called a Laplace distribution. It is useful because the median is a good estimator of its location parameter θ, and its scale parameter φ can be estimated from the median absolute deviation.
###Asymmetric Laplace In the Laplace distribution, the variances of the normal components depend on W, while the means are unaffected. Lets generate and visualize data from an asymmetric Laplace distribution.
# Asymmetric Laplace: both the mean (theta + mu*w) and the sd (sigma*sqrt(w))
# depend on the exponential variable w
mu = 0.3; sigma = 0.4; theta = -1
w = rexp(10000, 1)
alps = rnorm(length(w), theta + mu * w, sigma * sqrt(w))
ggplot(tibble(alps), aes(x = alps)) +
geom_histogram(fill = "purple", binwidth = 0.1)
modelling a real-world count data requires a two-level hierarchical model where simple Poisson and binomial distributions serve as the building blocks at the lower level. Examples of sampling schemes that are well modeled by mixtures of Poisson variables include RNA-Seq, and 16S rRNA-Seq data used in microbial ecology.
###Gamma distribution: two parameters (shape and scale) The gamma distribution is an extension of the (one-parameter) exponential distribution, but it has two parameters, which makes it more flexible. It is often useful as a building block for the upper level of a hierarchical model.
Example gamma distributions
set.seed(1234)
# Two example gamma samples: shape 2 / rate 1/3 (strongly right-skewed)
# and shape 10 / rate 3/2 (more symmetric)
ggplot(tibble(x = rgamma(10000, shape = 2, rate = 1/3)),
aes(x = x)) + geom_histogram(bins = 100, fill= "purple")
ggplot(tibble(x = rgamma(10000, shape = 10, rate = 3/2)),
aes(x = x)) + geom_histogram(bins = 100, fill= "purple")
Example gamma-Poisson hierachical model
# Gamma-Poisson hierarchy: Poisson rates drawn from a gamma distribution;
# the marginal distribution of gp is negative binomial
lambda = rgamma(10000, shape = 10, rate = 3/2)
gp = rpois(length(lambda), lambda = lambda)
ggplot(tibble(x = gp), aes(x = x)) +
geom_histogram(bins = 100, fill= "purple")
The Gamma-Poisson distribution is a discrete distribution. Gamma-Poisson distribution is also called negative binomial distribution. Formula for equation is similar to binomial distribution Gamma-Poisson is more defining of the underlying principles
# Fit a negative binomial to the simulated counts and inspect the fit
ofit = goodfit(gp, "nbinomial")
plot(ofit, xlab = "")
# Estimated size and prob parameters of the fitted negative binomial
ofit$par
## $size
## [1] 9.970598
##
## $prob
## [1] 0.5998435
###Variance stabilizing transformations Transformations are important in reducing the heterogeneity and heteroscedasticity of the data. The goal is to stabilze the variance. Instability in the variance is a source of noise. Example of simulated data before and after transformation
set.seed(1234)
# Forty Poisson draws at each of nine mean values 100..900
lambdas = seq(100, 900, by = 100)
simdat = lapply(lambdas, function(l)
tibble(y = rpois(n = 40, lambda=l), lambda = l)
) %>% bind_rows
# Data before transformation: spread grows with the mean
ggplot(simdat, aes(x = lambda, y = y)) +
geom_beeswarm(alpha = 0.6, color = "purple")
# Data after square-root transformation: spread roughly constant
ggplot(simdat, aes(x = lambda, y = sqrt(y))) +
geom_beeswarm(alpha = 0.6, color = "purple")
# Print fewer digits; 2*sqrt(y) should have sd close to 1 at every lambda
.o = options(digits = 3)
summarise(group_by(simdat, lambda), sd(y), sd(2*sqrt(y)))
## # A tibble: 9 x 3
## lambda `sd(y)` `sd(2 * sqrt(y))`
## <dbl> <dbl> <dbl>
## 1 100 10.1 0.996
## 2 200 15.4 1.06
## 3 300 15.5 0.903
## 4 400 18.3 0.913
## 5 500 23.6 1.05
## 6 600 23.6 0.959
## 7 700 25.6 0.965
## 8 800 31.2 1.10
## 9 900 32.9 1.10
options(.o)
Repeat the computation in the above code chunk for a version of simdat with a larger number of replicates than 40.
set.seed(1234)
# Repeat with 1000 replicates per lambda instead of 40
lambdas = seq(100, 900, by = 100)
simdat2 = lapply(lambdas, function(l)
tibble(y = rpois(n = 1000, lambda=l), lambda = l)
) %>% bind_rows
# Data before transformation
ggplot(simdat2, aes(x = lambda, y = y)) +
geom_beeswarm(alpha = 0.6, color = "purple")
# Data after square-root transformation
ggplot(simdat2, aes(x = lambda, y = sqrt(y))) +
geom_beeswarm(alpha = 0.6, color = "purple")
Now Using the gamma-Poisson distribution and and plotting the 95% confidence intervals around the mean.
# Gamma-Poisson (negative binomial) samples at means 1, 2, 4, ..., 1024
# NOTE(review): not seeded, so exact values differ between runs
muvalues = 2^seq(0, 10, by = 1)
simgp = lapply(muvalues, function(mu) {
u = rnbinom(n = 1e4, mu = mu, size = 4)
# Empirical mean, sd and central 95% interval at this mu
tibble(mean = mean(u), sd = sd(u),
lower = quantile(u, 0.025),
upper = quantile(u, 0.975),
mu = mu)
} ) %>% bind_rows
head(as.data.frame(simgp), 2)
## mean sd lower upper mu
## 1 1.0043 1.130048 0 4 1
## 2 2.0171 1.711286 0 6 2
# Mean with 95% interval error bars, as a function of mu
ggplot(simgp, aes(x = mu, y = mean, ymin = lower, ymax = upper)) +
geom_point() + geom_errorbar()
To transform these data to normalize their standard deviation, divide the values that correspond to mu[1] (and which are centered around simgp\(mean[1]) by their standard deviation simgp\)sd[1], the values that correspond to mu[2] (and which are centered around simgp\(mean[2]) by their standard deviation simgp\)sd[2], and so on, then the resulting values will have, by construction, a standard deviation (and thus variance) of 1.
# Piecewise variance-stabilizing transformation: accumulate the local
# slopes 1/sd along the sequence of means (numeric integration of 1/sd)
simgp = mutate(simgp,
slopes = 1 / sd,
trsf = cumsum(slopes * mean))
ggplot(simgp, aes(x = mean, y = trsf)) +
geom_point() + geom_line() + xlab("")
##Exercises
##Exercise 1 The EM algorithm step by step Model the Myst.rds dataset as a mixture of two normal distributions (A and B) with unknown means and standard deviations.
# Load the mystery data for the EM exercise and inspect it
yvar = readRDS(here("Book","data","Myst.rds"))$yvar
ggplot(tibble(yvar), aes(x = yvar)) + geom_histogram(binwidth=0.025)
str(yvar)
## num [1:1800] 0.3038 0.0596 -0.0204 0.1849 0.2842 ...
Create the distributions with the code chunk below.
# Random initialization of the membership probabilities for component A,
# one per data point (NOTE(review): not seeded, so runs differ)
pA = runif(length(yvar))
# Complementary membership probabilities for component B
pB = 1 - pA
#housekeeping variables: iter counts over the iterations of the EM algorithm; loglik stores the current log-likelihood; delta stores the change in the log-likelihood from the previous iteration to the current one. We also define the parameters tolerance, miniter and maxiter of the algorithm.
# Housekeeping: iteration counter, current log-likelihood, change in
# log-likelihood, convergence tolerance, and iteration bounds
iter = 0
loglik = -Inf
delta = +Inf
tolerance = 1e-3
miniter = 50; maxiter = 1000
# EM iterations: alternate an M-step (parameter estimates from the current
# membership probabilities) and an E-step (update membership probabilities)
# until the log-likelihood change falls below `tolerance`, running at
# least `miniter` and at most `maxiter` iterations. Explicit parentheses
# make the original condition's precedence ((A && B) || C) visible.
while (((delta > tolerance) && (iter <= maxiter)) || (iter < miniter)) {
  # M-step: mixing weight and weighted mean/sd of each component
  lambda = mean(pA)
  muA = weighted.mean(yvar, pA)
  muB = weighted.mean(yvar, pB)
  sdA = sqrt(weighted.mean((yvar - muA)^2, pA))
  sdB = sqrt(weighted.mean((yvar - muB)^2, pB))
  # E-step: component densities and posterior membership probabilities
  phiA = dnorm(yvar, mean = muA, sd = sdA)
  phiB = dnorm(yvar, mean = muB, sd = sdB)
  pA = lambda * phiA
  pB = (1 - lambda) * phiB
  ptot = pA + pB
  pA = pA / ptot
  pB = pB / ptot
  loglikOld = loglik
  # BUG FIX: the observed-data log-likelihood is sum(log(ptot)), the log
  # of the mixture density at each point. The original sum(log(pA)) summed
  # log *posterior* probabilities, which is not a likelihood and is not
  # comparable to, e.g., normalmixEM's loglik.
  loglik = sum(log(ptot))
  delta = abs(loglikOld - loglik)
  iter = iter + 1
}
# Collect the fitted component parameters from the final EM iteration
param = tibble(group = c("A","B"), mean = c(muA,muB), sd = c(sdA,sdB))
param
## # A tibble: 2 x 3
## group mean sd
## <chr> <dbl> <dbl>
## 1 A 0.147 0.150
## 2 B -0.169 0.0983
iter
## [1] 359
The algorithm for EM is: i. Given a set of incomplete data, consider a set of starting parameters. ii. Expectation step (E-step): using the observed available data of the dataset, estimate (guess) the values of the missing data. This is represented in the code as; lambda = mean(pA) >muA = weighted.mean(yvar, pA) >muB = weighted.mean(yvar, pB) >sdA = sqrt(weighted.mean((yvar - muA)^2, pA)) >sdB = sqrt(weighted.mean((yvar - muB)^2, pB))
Maximization step (M-step): complete data generated after the expectation (E) step is used in order to update the parameters.
d.Why do we need to compute loglik? loglik estimation is relevant for determining convergence. It is used in estimating delta which is in turn compared with the predefined tolerance argument for assessing convergence.
# Cross-check against mixtools, initializing from the hand-rolled EM result
mixtoolsQ <- normalmixEM(yvar, k=2,lambda=c(0.5,0.5), mu=c(muA,muB), sigma=c(sdA,sdB), maxit=1000)
## number of iterations= 149
# Estimated component means
mixtoolsQ$mu
## [1] 0.1473373 -0.1693509
# Estimated component standard deviations
mixtoolsQ$sigma
## [1] 0.14977994 0.09826952
# Final observed-data log-likelihood
mixtoolsQ$loglik
## [1] 417.2218
The results are similar. However, the normalmixEM function achieves the optimal loglik in consistently fewer iterations (149).
##Exercise 2 Compare the theoretical values of the gamma-Poisson distribution with parameters given by the estimates in ofit$par in Section 4.4.3 to the data used for the estimation using a QQ-plot.
# Obtain quantiles of the hierarchically simulated counts (gp); ppoints()
# supplies the probability ordinates for the QQ comparison
ordinates= ppoints(1000)
quant_raw=quantile(gp,ordinates)
# Theoretical gamma-Poisson quantiles using the parameters in ofit$par.
# NOTE(review): gamma_pois already IS the vector of theoretical quantiles,
# so taking quantile() of it again is redundant (near-identity); plotting
# quant_raw against gamma_pois directly would be equivalent and simpler.
gamma_pois = qnbinom(p=ordinates,size=ofit$par[[1]], prob=ofit$par[[2]])
quant_gamma_pois=quantile(gamma_pois,ordinates)
# Plot empirical vs theoretical quantiles with the identity line
plot(quant_raw,quant_gamma_pois)
abline(a =0, b=1, col="red")
There is a high similarity among both datasets.
##Exercise 3
# Load, view and plot the NPreg data shipped with flexmix
data("NPreg")
head(NPreg)
## x yn yp yb class id1 id2
## 1 4.176633 22.380379 4 0 1 1 1
## 2 1.201631 5.111575 3 0 1 1 1
## 3 2.295006 13.251058 9 0 1 2 1
## 4 5.965868 30.285240 3 1 1 2 1
## 5 2.358083 14.764508 4 0 1 3 2
## 6 7.637061 42.833760 1 1 1 3 2
summary(NPreg)
## x yn yp yb
## Min. :0.06295 Min. :-1.482 Min. : 0.000 Min. :0.000
## 1st Qu.:2.67892 1st Qu.:20.489 1st Qu.: 2.000 1st Qu.:0.000
## Median :5.07820 Median :30.933 Median : 4.000 Median :0.000
## Mean :5.03078 Mean :28.607 Mean : 3.955 Mean :0.475
## 3rd Qu.:7.37686 3rd Qu.:38.322 3rd Qu.: 5.000 3rd Qu.:1.000
## Max. :9.89474 Max. :53.487 Max. :12.000 Max. :1.000
##
## class id1 id2
## Min. :1.0 1 : 2 1 : 4
## 1st Qu.:1.0 2 : 2 2 : 4
## Median :1.5 3 : 2 3 : 4
## Mean :1.5 4 : 2 4 : 4
## 3rd Qu.:2.0 5 : 2 5 : 4
## Max. :2.0 6 : 2 6 : 4
## (Other):188 (Other):176
# There is an independent variable (x) and three responses following
# normal (yn), Poisson (yp) and binomial (yb) regressions, plus a
# two-level class variable.
ggplot(NPreg, aes(x = x, y = yn, color=as.factor(class))) + geom_point()
ggplot(NPreg, aes(x = x, y = yp, color=as.factor(class))) + geom_point()
ggplot(NPreg, aes(x = x, y = yb, color=as.factor(class))) + geom_point()
Of all the plots, the normal regression (yn) is easily distiguishable by class. Class 1 seems to follow a linear regression while class two follows a non-linear (possibly quadratic) regression.
# Two-component mixture of regressions of yn on x and x^2
m1 = flexmix(yn ~ x + I(x^2), data = NPreg, k = 2)
# Printing the object shows cluster sizes and convergence iterations;
# more detail is available via summary()
m1
##
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
##
## Cluster sizes:
## 1 2
## 100 100
##
## convergence after 15 iterations
summary(m1)
##
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
##
## prior size post>0 ratio
## Comp.1 0.494 100 145 0.690
## Comp.2 0.506 100 141 0.709
##
## 'log Lik.' -642.5453 (df=9)
## AIC: 1303.091 BIC: 1332.775
# Inspect the fitted parameters of each component via parameters(),
# selecting the appropriate component
parameters(m1, component = 1)
## Comp.1
## coef.(Intercept) -0.20959655
## coef.x 4.81774867
## coef.I(x^2) 0.03616097
## sigma 3.47652037
parameters(m1, component = 2)
## Comp.2
## coef.(Intercept) 14.7171140
## coef.x 9.8466333
## coef.I(x^2) -0.9683531
## sigma 3.4796257
# Contingency table of true class vs inferred cluster assignment
table(NPreg$class, clusters(m1))
##
## 1 2
## 1 95 5
## 2 5 95
summary(m1)
##
## Call:
## flexmix(formula = yn ~ x + I(x^2), data = NPreg, k = 2)
##
## prior size post>0 ratio
## Comp.1 0.494 100 145 0.690
## Comp.2 0.506 100 141 0.709
##
## 'log Lik.' -642.5453 (df=9)
## AIC: 1303.091 BIC: 1332.775
The summary provides information on the log Lik (-642.5453), degrees of freedom (df=9), AIC: 1303.091 and BIC: 1332.775. The prior and posterior sizes of both classes are also indicated.
# Color points by inferred cluster, shape by true class, to visualize
# agreement between the mixture fit and the ground truth
ggplot(NPreg, aes(x = x, y = yn, group = class)) +
geom_point(aes(colour = as.factor(m1@cluster), shape = factor(class)))
Other hierarchical noise models: Find two papers that explore the use of other infinite mixtures for modeling molecular biology technological variation.
Yau, C., Papaspiliopoulos, O., Roberts, G.O. and Holmes, C., 2011. Bayesian non-parametric hidden Markov models with applications in genomics. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 73(1), pp.37-57.
Dang, T. and Kishino, H., 2019. Stochastic variational inference for Bayesian phylogenetics: A case of CAT model. Molecular biology and evolution, 36(4), pp.825-833.